In [425]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import (
RandomForestRegressor, GradientBoostingRegressor,
AdaBoostRegressor
)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error
# NOTE(review): hardcoded absolute local paths (with a doubled slash in
# "path//to") — these will only resolve on one machine; consider a
# configurable DATA_DIR. Filenames suggest one file per target (y1 / y2),
# but the schemas are not visible here — TODO confirm both CSVs share columns.
file1 = '/Users/fizza/path//to/Gly-data y1 copy.csv'
file2 = '/Users/fizza/path//to/Gly-data y2.csv'
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)
# Stack the two frames row-wise; ignore_index=True gives a fresh 0..n-1 index.
data = pd.concat([df1, df2], ignore_index=True)
# Sentinel placeholder strings found in the raw target columns; mapped to a
# small positive floor value instead of NaN. (The original list contained
# 'xx' three times — deduplicated into a set here.)
TARGET_SENTINELS = {'xx'}
TARGET_FLOOR = 0.01


def clean_target(val):
    """Coerce a raw target cell to a float.

    - Values whose stripped string form is a known sentinel become
      TARGET_FLOOR (0.01).
    - Anything parseable as a number is returned as float.
    - Everything else becomes NaN (filled downstream with 0.01).
    """
    if str(val).strip() in TARGET_SENTINELS:
        return TARGET_FLOOR
    try:
        return float(val)
    except (TypeError, ValueError):
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
        # only conversion failures should fall through to NaN.
        return np.nan
# Normalise both target columns, drop rows where *neither* target exists,
# then floor any remaining missing targets at 0.01.
target_cols = ['y1', 'y2']
for col in target_cols:
    data[col] = data[col].apply(clean_target)
data = data.dropna(subset=target_cols, how='all')
for col in target_cols:
    data[col] = data[col].fillna(0.01)

# Features are everything except the targets; y1 is modelled first.
X = data.drop(columns=target_cols)
y1 = data['y1']

# Booleans are cast to strings so the one-hot encoder treats them as categories.
bool_cols = X.select_dtypes(include='bool').columns.tolist()
X[bool_cols] = X[bool_cols].astype(str)

# Column groups consumed by the ColumnTransformer below.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Pre-clean outside the pipeline too: string-ify categoricals with a
# 'missing' placeholder, coerce numerics, and mean-impute NaNs (so the
# pipeline imputers see already-clean data).
X[categorical_cols] = X[categorical_cols].fillna('missing').astype(str)
X[numerical_cols] = X[numerical_cols].apply(pd.to_numeric, errors='coerce')
X[numerical_cols] = X[numerical_cols].fillna(X[numerical_cols].mean())
# Preprocessing: impute-then-encode for categoricals, impute-then-scale
# for numerics, combined column-wise into a single transformer.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
cat_pipeline = Pipeline(steps=categorical_steps)
num_pipeline = Pipeline(steps=numeric_steps)
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_pipeline, categorical_cols),
    ('num', num_pipeline, numerical_cols),
])
# Candidate regressors to compare. Ensemble/boosting models share
# n_estimators=100 and random_state=42 so the comparison is repeatable.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(),
    Lasso=Lasso(),
    SVR=SVR(),
    DecisionTree=DecisionTreeRegressor(),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
    GradientBoosting=GradientBoostingRegressor(n_estimators=100, random_state=42),
    AdaBoost=AdaBoostRegressor(n_estimators=100, random_state=42),
    XGBoost=XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
    CatBoost=CatBoostRegressor(n_estimators=100, verbose=0, random_state=42),
)
# Hold out 20% of rows for validation (fixed seed for reproducibility).
X_train, X_val, y1_train, y1_val = train_test_split(X, y1, test_size=0.2, random_state=42)

# Fit every candidate model behind the shared preprocessing step and
# collect train/validation R² and RMSE per model.
results = []
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y1_train)
    y1_train_pred = pipeline.predict(X_train)
    y1_val_pred = pipeline.predict(X_val)
    # np.sqrt(MSE) instead of mean_squared_error(..., squared=False):
    # the `squared` keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6, so the old call raises TypeError on current versions.
    results.append({
        'Model': name,
        'Train R²': r2_score(y1_train, y1_train_pred),
        'Validation R²': r2_score(y1_val, y1_val_pred),
        'Train RMSE': np.sqrt(mean_squared_error(y1_train, y1_train_pred)),
        'Validation RMSE': np.sqrt(mean_squared_error(y1_val, y1_val_pred)),
    })

# BUG FIX: results_df was used below but never created, which raised a
# NameError on a fresh Restart-&-Run-All.
results_df = pd.DataFrame(results)

# Grouped bar chart: train vs validation R² for each model.
plt.figure(figsize=(12, 6))
sns.barplot(data=results_df.melt(id_vars='Model', value_vars=['Train R²', 'Validation R²']),
            x='Model', y='value', hue='variable')
plt.title('A-Ratio: R² Scores for Train and Validation')
plt.ylabel('R² Score')
plt.xticks(rotation=45)
plt.legend(title='Dataset', loc='lower right')
plt.tight_layout()
plt.show()
In [433]:
# NOTE(review): these imports and the `models` dict duplicate the first
# cell. They are kept so this cell can run standalone, but a single
# definition at the top of the notebook would be cleaner.
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor,
    AdaBoostRegressor
)
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Fresh (unfitted) estimator instances for this cell's fits.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "SVR": SVR(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
    "CatBoost": CatBoostRegressor(n_estimators=100, verbose=0, random_state=42)
}

# One scatter panel per model: predicted vs true α-ratio on the VALIDATION set.
fig, axes = plt.subplots(5, 2, figsize=(12, 18), dpi=600)
axes = axes.flatten()
for ax, (name, model) in zip(axes, models.items()):
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y1_train)
    y1_val_pred = pipeline.predict(X_val)
    r2 = r2_score(y1_val, y1_val_pred)
    # np.sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
    # removed in 1.6, so the old call fails on current versions.
    rmse = np.sqrt(mean_squared_error(y1_val, y1_val_pred))
    ax.scatter(y1_val, y1_val_pred, alpha=0.6, edgecolors='k')
    # y = x reference line (a perfect model lies on this diagonal).
    ax.plot([y1_val.min(), y1_val.max()], [y1_val.min(), y1_val.max()], 'r--')
    ax.set_title(name)
    ax.set_xlabel('True α-ratio')
    ax.set_ylabel('Predicted α-ratio')
    # Legend carries the metrics text; it attaches to the scatter artist.
    ax.legend([f"R² = {r2:.2f}\nRMSE = {rmse:.2f}"], loc='lower right')
plt.tight_layout()
plt.suptitle("Validation: Actual vs Predicted α-ratio for All Models", fontsize=18, y=1.02)
plt.show()
In [435]:
# NOTE(review): this cell is a near copy of the validation-scatter cell,
# differing only in which split it plots. The duplicated imports and
# `models` dict are kept for standalone execution, but extracting a
# plot_predictions(split) helper would remove the copy-paste.
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor,
    AdaBoostRegressor
)
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Fresh (unfitted) estimator instances for this cell's fits.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "SVR": SVR(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
    "CatBoost": CatBoostRegressor(n_estimators=100, verbose=0, random_state=42)
}

# One scatter panel per model: predicted vs true α-ratio on the TRAIN set
# (useful to spot overfitting against the validation panels).
fig, axes = plt.subplots(5, 2, figsize=(12, 18), dpi=600)
axes = axes.flatten()
for ax, (name, model) in zip(axes, models.items()):
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y1_train)
    y1_train_pred = pipeline.predict(X_train)
    r2 = r2_score(y1_train, y1_train_pred)
    # np.sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
    # removed in 1.6, so the old call fails on current versions.
    rmse = np.sqrt(mean_squared_error(y1_train, y1_train_pred))
    ax.scatter(y1_train, y1_train_pred, alpha=0.6, edgecolors='k')
    # y = x reference line (a perfect model lies on this diagonal).
    ax.plot([y1_train.min(), y1_train.max()], [y1_train.min(), y1_train.max()], 'r--')
    ax.set_title(name)
    ax.set_xlabel('True α-ratio')
    ax.set_ylabel('Predicted α-ratio')
    # Legend carries the metrics text; it attaches to the scatter artist.
    ax.legend([f"R² = {r2:.2f}\nRMSE = {rmse:.2f}"], loc='lower right')
plt.tight_layout()
plt.suptitle("Train: Actual vs Predicted α-ratio for All Models", fontsize=18, y=1.02)
plt.show()